# Loading the necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats
import numpy as np
import seaborn as sns
import statsmodels.api as sm
import statistics
# Importing crime file (Rates of offences per thousand population)
cr_df = pd.read_excel('crime.xlsx', sheet_name = 'Crime Rates', header = [1])
# checking the head of data
cr_df.head()
# Removing the area code columns as it is not required for our analysis.
cr_df.drop('Code', axis=1, inplace=True)
# Checking for missing values.
# Fixed: `.sum` without parentheses only returned the bound method object and
# never actually computed the per-column missing-value counts.
cr_df.isnull().sum()
Observations: The data frame does not contain any missing values in the rows that we need for our analysis. The missing values that are present are artefacts of loading the Excel file into Python; those values will be removed.
# Drop the rows containing missing values (artefacts of the spreadsheet layout).
cr_df = cr_df.dropna()
# Checking the head of data
cr_df.head(10)
# Label every row by its borough name; later look-ups and joins key on this.
cr_df = cr_df.set_index('Borough')
# Checking data types
cr_df.dtypes
# Keep only the first 32 rows (the boroughs); the trailing rows cover areas
# that are not part of our study.
cf_df = cr_df.iloc[:32]
cf_df.head()
Splitting the crime data frame into separate data frames, one per crime type, for the purpose of future analysis
# Violence data frame.
# .copy() avoids pandas' SettingWithCopyWarning when columns are renamed below.
violence = cf_df.iloc[:, 18:36].copy()
# Renaming the columns for readability (also fixes the stray leading spaces the
# original labels for 2003 and 2012 carried).
violence.columns = ['Year 1999', 'Year 2000', 'Year 2001', 'Year 2002', 'Year 2003', 'Year 2004', 'Year 2005', 'Year 2006', 'Year 2007', 'Year 2008', 'Year 2009', 'Year 2010', 'Year 2011', 'Year 2012', 'Year 2013', 'Year 2014', 'Year 2015', 'Year 2016']
# Checking the number of rows and columns
violence.shape
# Checking the head of data
violence.head()
# Checking for missing values.
# Fixed: `.sum` without parentheses returned the method, not the counts.
violence.isnull().sum()
# Sex offences data frame (.copy() prevents SettingWithCopyWarning on rename).
sex_offences = cf_df.iloc[:, 36:54].copy()
# Renaming the columns for readability (fixed stray leading spaces on 2003/2012).
sex_offences.columns = ['Year 1999', 'Year 2000', 'Year 2001', 'Year 2002', 'Year 2003', 'Year 2004', 'Year 2005', 'Year 2006', 'Year 2007', 'Year 2008', 'Year 2009', 'Year 2010', 'Year 2011', 'Year 2012', 'Year 2013', 'Year 2014', 'Year 2015', 'Year 2016']
sex_offences.head()
# Robbery data frame (.copy() prevents SettingWithCopyWarning on rename).
robbery = cf_df.iloc[:, 54:72].copy()
# Renaming the columns for readability (fixed stray leading spaces on 2003/2012).
robbery.columns = ['Year 1999', 'Year 2000', 'Year 2001', 'Year 2002', 'Year 2003', 'Year 2004', 'Year 2005', 'Year 2006', 'Year 2007', 'Year 2008', 'Year 2009', 'Year 2010', 'Year 2011', 'Year 2012', 'Year 2013', 'Year 2014', 'Year 2015', 'Year 2016']
robbery.head()
# Burglary data frame (.copy() prevents SettingWithCopyWarning on rename).
burglary = cf_df.iloc[:, 72:90].copy()
# Renaming the columns for readability (fixed stray leading spaces on 2003/2012).
burglary.columns = ['Year 1999', 'Year 2000', 'Year 2001', 'Year 2002', 'Year 2003', 'Year 2004', 'Year 2005', 'Year 2006', 'Year 2007', 'Year 2008', 'Year 2009', 'Year 2010', 'Year 2011', 'Year 2012', 'Year 2013', 'Year 2014', 'Year 2015', 'Year 2016']
burglary.head()
# Theft data frame (.copy() prevents SettingWithCopyWarning on rename).
theft = cf_df.iloc[:, 90:108].copy()
# Renaming the columns for readability (fixed stray leading spaces on 2003/2012).
theft.columns = ['Year 1999', 'Year 2000', 'Year 2001', 'Year 2002', 'Year 2003', 'Year 2004', 'Year 2005', 'Year 2006', 'Year 2007', 'Year 2008', 'Year 2009', 'Year 2010', 'Year 2011', 'Year 2012', 'Year 2013', 'Year 2014', 'Year 2015', 'Year 2016']
theft.head()
# Fraud and Forgery data frame (.copy() prevents SettingWithCopyWarning on rename).
fraud_forgery = cf_df.iloc[:, 108:126].copy()
# Renaming the columns for readability (fixed stray leading spaces on 2003/2012).
fraud_forgery.columns = ['Year 1999', 'Year 2000', 'Year 2001', 'Year 2002', 'Year 2003', 'Year 2004', 'Year 2005', 'Year 2006', 'Year 2007', 'Year 2008', 'Year 2009', 'Year 2010', 'Year 2011', 'Year 2012', 'Year 2013', 'Year 2014', 'Year 2015', 'Year 2016']
fraud_forgery.head()
# Criminal damage data frame (.copy() prevents SettingWithCopyWarning on rename).
criminal_damage = cf_df.iloc[:, 126:144].copy()
# Renaming the columns for readability (fixed stray leading spaces on 2003/2012).
criminal_damage.columns = ['Year 1999', 'Year 2000', 'Year 2001', 'Year 2002', 'Year 2003', 'Year 2004', 'Year 2005', 'Year 2006', 'Year 2007', 'Year 2008', 'Year 2009', 'Year 2010', 'Year 2011', 'Year 2012', 'Year 2013', 'Year 2014', 'Year 2015', 'Year 2016']
criminal_damage.head()
# Drugs data frame (.copy() prevents SettingWithCopyWarning on rename).
drugs = cf_df.iloc[:, 144:162].copy()
# Renaming the columns for readability (fixed stray leading spaces on 2003/2012).
drugs.columns = ['Year 1999', 'Year 2000', 'Year 2001', 'Year 2002', 'Year 2003', 'Year 2004', 'Year 2005', 'Year 2006', 'Year 2007', 'Year 2008', 'Year 2009', 'Year 2010', 'Year 2011', 'Year 2012', 'Year 2013', 'Year 2014', 'Year 2015', 'Year 2016']
drugs.head()
# Other offences data frame (.copy() prevents SettingWithCopyWarning on rename).
other_offences = cf_df.iloc[:, 162:180].copy()
# Renaming the columns for readability (fixed stray leading spaces on 2003/2012).
other_offences.columns = ['Year 1999', 'Year 2000', 'Year 2001', 'Year 2002', 'Year 2003', 'Year 2004', 'Year 2005', 'Year 2006', 'Year 2007', 'Year 2008', 'Year 2009', 'Year 2010', 'Year 2011', 'Year 2012', 'Year 2013', 'Year 2014', 'Year 2015', 'Year 2016']
other_offences.head()
# All offences together (overall crime rate) data frame
# (.copy() prevents SettingWithCopyWarning on the rename below).
all_offences = cf_df.iloc[:, 0:18].copy()
all_offences.head()
# Renaming the columns for readability (fixed stray leading spaces on 2003/2012).
all_offences.columns = ['Year 1999', 'Year 2000', 'Year 2001', 'Year 2002', 'Year 2003', 'Year 2004', 'Year 2005', 'Year 2006', 'Year 2007', 'Year 2008', 'Year 2009', 'Year 2010', 'Year 2011', 'Year 2012', 'Year 2013', 'Year 2014', 'Year 2015', 'Year 2016']
all_offences.head()
Investigating the change in overall crime rate (all offences) in London (by borough)
# Transpose so years run down the rows: a time-series view of the data.
# This will help with analysis of the trends in crime rate.
all_offences_time_series = all_offences.T
all_offences_time_series.index.name = 'Year'
all_offences_time_series.head()
# Compare each borough's 1999 rate against its 2016 rate.
rate_1999 = all_offences["Year 1999"]
rate_2016 = all_offences["Year 2016"]
print(np.corrcoef(rate_1999, rate_2016))
plt.scatter(rate_1999, rate_2016)
plt.xlabel('Year 1999')
plt.ylabel('Year 2016')
plt.title('All offences in Year 1999 and Year 2016')
Observations: We can see a strong positive correlation (0.97) between the boroughs' all-offences rates in 1999 and 2016, indicating a common downward trend. We can also observe one obvious outlier.
# Converting to a datetime index. Even though the exact dates may differ from the
# original dataset, the range covers exactly the same 18 annual periods.
# freq='Y' produces year-END stamps (1999-12-31, 2000-12-31, ...).
rng = pd.date_range(start='01.01.1999', periods = 18, freq = 'Y' )
rng
all_offences_time_series.set_index(rng,inplace = True )
all_offences_time_series.head()
# Displaying the change in total crime rates by borough between years 1999 and 2016
# (one line per borough; the returned Axes object is reused for labelling).
ux = all_offences_time_series.plot(colormap='tab10', fontsize=10, linewidth=4, figsize=(20,12) )
# Set labels and legend
ux.set_xlabel('Year', fontsize=15)
ux.set_ylabel('Crime Rate', fontsize=15)
ux.set_title('Crime rate in London by borough (Years 1999-2016 )', fontsize=20)
# Legend placed outside the axes so it does not cover the 32 borough lines.
ux.legend(loc='center left', bbox_to_anchor=(1.0, 0.5), fontsize=15)
Observations: As we can see on the above graph there is a decreasing trend in the overall crime rate (all offences) for all boroughs. Visually, Westminster shows the highest crime rate during the whole period between 1999 and 2016. We can confidently say that Westminster is the outlier that we could see on the scatter plot above. The majority of boroughs do not significantly differ from each other.
# Finding what was the percentage change in all offences rate between years 1999 and 2016.
# freq='17Y' compares each row with the value 17 year-ends earlier, so only the
# last row (2016 vs 1999) is non-NaN — presumably that is the intent; verify
# against the plotted output.
change_1999_2016 = all_offences_time_series.pct_change(freq = '17Y' )
# Row 17 is the 2016 row: the 1999->2016 fractional change per borough.
ch = change_1999_2016.iloc[17, :]
ch.head(32)
# Displaying the percentage change in crime rate between years 1999 and 2016
ch.plot(kind = 'bar', figsize=(10,10), title = ' Percentage change in crime rate between years 1999 and 2016')
# NOTE: style change applies to figures created AFTER this call.
plt.style.use('fivethirtyeight')
Observations: we can observe that Westminster and Camden not only had the highest crime rates in year 1999 but also showed the biggest drops in crime by year 2016 (63% for Westminster and 57% for Camden). Havering demonstrated the lowest percentage drop in crime (30%). It seems that the higher the initial crime rate was, the more significant the crime rate drop between 1999 and 2016.
# Extracting, for every crime type, the first (1999) and last (2016) year's
# column as a Series; these feed the per-type comparison plots below.
# All offences in year 1999 (comment fixed: previously said 2009)
all_offences1999 = all_offences.iloc[:, 0]
all_offences1999.head()
# All offences in year 2016
all_offences2016 = all_offences.iloc[:, 17]
all_offences2016.head()
# Violence Against the Person in year 1999
violence1999 = violence.iloc[:, 0]
violence1999.head()
# Violence Against the Person in year 2016
violence2016 = violence.iloc[:, 17]
violence2016.head()
# Sexual Offences in year 1999
sex_offences1999 = sex_offences.iloc[:, 0]
sex_offences1999.head()
# Sexual Offences in year 2016
sex_offences2016 = sex_offences.iloc[:, 17]
sex_offences2016.head()
# Robbery in year 1999
robbery1999 = robbery.iloc[:, 0]
robbery1999.head()
# Robbery in year 2016
robbery2016 = robbery.iloc[:, 17]
robbery2016.head()
# Burglary in year 1999
burglary1999 = burglary.iloc[:, 0]
burglary1999.head()
# Burglary in year 2016
burglary2016 = burglary.iloc[:, 17]
burglary2016.head()
# Theft and Handling in year 1999
theft1999 = theft.iloc[:, 0]
theft1999.head()
# Theft and Handling in year 2016
theft2016 = theft.iloc[:, 17]
theft2016.head()
# Fraud and Forgery in year 1999
fraud_forgery1999 = fraud_forgery.iloc[:, 0]
fraud_forgery1999.head()
# Fraud and Forgery in year 2016
fraud_forgery2016 = fraud_forgery.iloc[:, 17]
fraud_forgery2016.head()
# Criminal Damage in year 1999
criminal_damage1999 = criminal_damage.iloc[:, 0]
criminal_damage1999.head()
# Criminal Damage in year 2016
criminal_damage2016 = criminal_damage.iloc[:, 17]
criminal_damage2016.head()
# Drugs in year 1999
drugs1999 = drugs.iloc[:, 0]
drugs1999.head()
# Drugs in year 2016
drugs2016 = drugs.iloc[:, 17]
drugs2016.head()
# Other offences in year 1999
other_offences1999 = other_offences.iloc[:, 0]
other_offences1999.head()
# Other offences in year 2016
other_offences2016 = other_offences.iloc[:, 17]
other_offences2016.head()
# Combining all crime types in year 1999: one column per crime type, one row per
# borough (columns aligned positionally by the shared Borough index).
crime_by_type_1999 = pd.concat([violence1999, sex_offences1999, robbery1999, burglary1999, theft1999, fraud_forgery1999, criminal_damage1999, drugs1999, other_offences1999], keys = ['Violence Against the Person', 'Sexual Offences','Robbery','Burglary','Theft and Handling','Fraud or Forgery','Criminal Damage', 'Drugs','Other Notifiable Offences'], axis = 1)
crime_by_type_1999.head()
crime_by_type_1999.shape
# Combining all crime types in year 2016 (same layout as the 1999 frame)
crime_by_type_2016 = pd.concat([violence2016, sex_offences2016, robbery2016, burglary2016, theft2016, fraud_forgery2016, criminal_damage2016, drugs2016, other_offences2016], keys = ['Violence Against the Person', 'Sexual Offences','Robbery','Burglary','Theft and Handling','Fraud or Forgery','Criminal Damage', 'Drugs','Other Notifiable Offences'], axis = 1)
crime_by_type_2016.head()
# Plotting all crimes to see the proportional division of types of crime in year 1999
# (stacked bars: the full bar is the sum over crime types per borough)
crime_by_type_1999.plot.bar( stacked = True, edgecolor='black', figsize = (13,13), title = 'Crime Rate By Type In 1999' )
plt.legend(fontsize = 12)
plt.ylabel ('Crime Rate')
plt.xticks(fontsize = 10)
Observations: As we can see on the above graph, the biggest part of all offences in year 1999 is accounted for by the Theft and Handling crime type.
# Plotting all crimes to see the proportional division of types of crime in year 2016
# (stacked bars, same layout as the 1999 chart for visual comparison)
crime_by_type_2016.plot.bar( stacked = True, edgecolor='black', figsize = (13,13), title = 'Crime Rate By Type In 2016' )
plt.legend(fontsize = 12)
plt.ylabel ('Crime Rate')
plt.xticks(fontsize = 10)
Observations: As demonstrated above in year 2016 the biggest part of all offences for majority of boroughs is still Theft and Handling however the proportion of it to other crime types is not as high as before. We can see a big increase in violence against the person crime type as a proportion of all crimes. The change in the rest of crime types is not very visible
# Checking the proportion (in %) of violence against the person in all offences in year 1999
proportion_of_violence1999 = (violence1999 /all_offences1999) * 100
print(proportion_of_violence1999)
# Plotting proportion of violence against the person in all offences in year 1999
proportion_of_violence1999.plot.bar(title = 'Proportion of violence against the person in all offences (year 1999)' )
# Average proportion of violence against the person in all offences among all boroughs in year 1999
# (statistics.mean on a Series works but, unlike Series.mean, does not skip NaN)
print(statistics.mean(proportion_of_violence1999))
# Proportion of violence against the person in all offences in year 2016
proportion_of_violence2016 = (violence2016/all_offences2016) * 100
print(proportion_of_violence2016)
# Plotting proportion of violence against the person in all offences in year 2016
# (comment fixed: previously said year 1999)
proportion_of_violence2016.plot.bar(title = 'Proportion of violence against the person in all offences (year 2016)')
# Average proportion of violence against the person in all offences among all boroughs in year 2016
print(statistics.mean(proportion_of_violence2016))
Observations: The proportion of violence against the person in all offences has gone dramatically up between years 1999 and 2016.
Now lets see how the violence against the person changed from year to year in the period between 1999 and 2016
# Setting year as the index for the 'violence' data frame in order to convert it
# to a time series data frame. This will help with analysis of trends in the
# violence rate.
violence_time_series = violence.T
violence_time_series.head()
# Converting to a datetime index. Even though the exact dates may differ from the
# original data set, the range covers exactly the same 18 annual periods.
violence_time_series.index.name = 'Year'
rng_v = pd.date_range(start='01.01.1999', periods = 18, freq = 'Y' )
rng_v
violence_time_series.set_index(rng_v,inplace = True )
violence_time_series.head()
# Displaying the change in Violence against the person rate by borough between years 1999 and 2016
uv = violence_time_series.plot(colormap='tab10', fontsize=10, linewidth=4, figsize=(20,12) )
# Set labels and legend
uv.set_xlabel('Year', fontsize=15)
uv.set_ylabel('Violence Rate', fontsize=15)
uv.set_title('Violence against the person rate in London by borough (Years 1999-2016 )', fontsize=20)
# Legend placed outside the axes so it does not cover the borough lines.
uv.legend(loc='center left', bbox_to_anchor=(1.0, 0.5), fontsize=15)
The above graph looks messy and it is difficult to see any trends. Let's split it into individual subplots for a clearer picture.
# One small panel per borough; 6x6 grid gives 36 slots for the 32 boroughs
# (the last four stay empty).
uvv = violence_time_series.plot(subplots=True,
layout=(6, 6),
sharex=True,
sharey=False,
linewidth=0.7,
fontsize=3,
figsize=(15,15),
legend=True)
plt.show()
Observations: as per above time series it is very difficult to see any trends in change in violence. There are a lot of ups and downs year by year. It shows a more complex behaviour than 'all offences' patterns. Interestingly for most of boroughs there was a drop in violence around year 2014 and then sharp rise between 2014 and 2016
Instead of concentrating on each borough lets compare how average violence rate changed in comparison to average all offences rate in London.
# Finding and plotting London average all offences rate year-on-year.
# np.mean over the DataFrame yields one per-year (column) mean across boroughs.
plt.bar(height = np.mean(all_offences), x = all_offences.columns )
plt.xticks(rotation=90)
plt.title('Average all offences rate in London by year ')
Observation: As previously we can see a clear trend to decrease in all offences
# Finding and plotting London average violence rate year-on-year.
# np.mean over the DataFrame yields one per-year (column) mean across boroughs.
plt.bar(height = np.mean(violence), x = violence.columns )
plt.xticks(rotation=90)
plt.title('Average violence rate in London by year ')
Observations: Even though average violence rate changes year to year in the long run it stays approximately the same.
# Fitting a simple linear regression model on a year-on-year basis to test the
# change in all offences rates. We use OLS methodology to fit these models.
# No constant is added, so each model is a regression through the origin: the
# single coefficient is the year-on-year scaling factor (<1 = falling rate).
# Fixed: the loop body had lost its indentation (the export would not run), and
# positional `[0]` on a labelled Series is deprecated — use .iloc[0].
a_coefs = [0]*17
for year_index in range(17):
    X = all_offences.iloc[:, year_index]
    y = all_offences.iloc[:, year_index + 1]
    model = sm.OLS(y, X).fit()
    #print(model.summary())
    print(model.params.iloc[0])
    print(model.pvalues.iloc[0])
    print('\t')
    a_coefs[year_index] = model.params.iloc[0]
np.mean(a_coefs)
Observations: The vast majority of fitted coefficients are clearly less than one which implies a decreasing all offences rate. This conclusion is further validated by the highly significant p-values which are all < 0.0001
# Fitting simple linear regression models on a year-on-year basis to test the
# change in violence rates. We use OLS methodology to fit these models.
# As for the all-offences fits: regression through the origin, so the single
# coefficient is the year-on-year scaling factor.
# Fixed: restored the lost loop-body indentation and replaced deprecated
# positional `[0]` Series access with .iloc[0].
v_coefs = [0]*17
for year_index in range(17):
    X = violence.iloc[:, year_index]
    y = violence.iloc[:, year_index + 1]
    model = sm.OLS(y, X).fit()
    #print(model.summary())
    print(model.params.iloc[0])
    print(model.pvalues.iloc[0])
    print('\t')
    v_coefs[year_index] = model.params.iloc[0]
np.mean(v_coefs)
Observations: Some of the coefficients are < 1 and some > 1, which means there is a mix of year-to-year increases and decreases in the violence rate. However, the average coefficient is slightly above one, so we can conclude that there was a slight increase in the violence rate between years 1999 and 2016.
# Per-year gap between the all-offences and the violence-against-the-person
# year-on-year coefficients.
diffs = [a - v for a, v in zip(a_coefs, v_coefs)]
plt.bar(x=all_offences.columns[:17], height=diffs)
plt.xticks(rotation=90)
Loading datasets to extract the required socioeconomic factors
# Loading pubs and bars dataset
pubs_bars = pd.read_excel('pubs.xlsx', sheet_name = 'Pubs units', header = [3])
pubs_bars.head()
# The rows we need contain no missing values; the NaNs are artefacts of loading
# the Excel sheet into Python and are removed below.
# Fixed: `.sum` without parentheses returned the method, not the counts.
pubs_bars.isnull().sum()
# Removing the missing values.
pubs_bars.dropna(inplace=True)
# Doing all the necessary transformations.
pubs_bars.drop('Unnamed: 0', axis=1, inplace=True)
pubs_bars.set_index('Unnamed: 1',inplace = True )
pubs_bars.index.name = 'Borough'
pubs_bars.head()
pubs_bars.shape
pubs_bars.head()
# Loading licensed restaurants dataset
licensed_restaurants = pd.read_excel('licensed-restaurants-cafes-borough.xlsx', sheet_name = 'Restaurants units', header = [3])
licensed_restaurants.head()
# Doing all the necessary transformations as per the 'pubs_bars' data frame
licensed_restaurants.drop('Unnamed: 0', axis=1, inplace=True)
licensed_restaurants.set_index('Unnamed: 1',inplace = True )
licensed_restaurants.index.name = 'Borough'
# The rows we need contain no missing values; the NaNs are artefacts of loading
# the Excel sheet into Python and are removed below.
# Fixed: bare isnull() dumped the whole boolean frame — .sum() gives the
# per-column counts, consistent with the other missing-value checks.
licensed_restaurants.isnull().sum()
# The rows that we need do not have any missing values; remove the rest.
licensed_restaurants.dropna(inplace=True)
licensed_restaurants.head()
licensed_restaurants.shape
# Loading unlicensed restaurants dataset
unlicensed_restaurants = pd.read_excel('unlicensed-restaurants.xlsx', sheet_name = 'Unlicensed Restaurants units', header = [3])
# Same clean-up as for the 'pubs_bars' frame: drop the helper column, drop the
# NaN padding rows, then label every row by its borough.
unlicensed_restaurants = (unlicensed_restaurants
                          .drop('Unnamed: 0', axis=1)
                          .dropna()
                          .set_index('Unnamed: 1'))
unlicensed_restaurants.index.name = 'Borough'
unlicensed_restaurants.head()
unlicensed_restaurants.shape
# Loading take away restaurants dataset
take_away = pd.read_excel('take.away.xlsx', sheet_name = 'Takeaway and food stand units', header = [3])
take_away.head()
# Same clean-up as for the 'pubs_bars' frame: drop the helper column, drop the
# NaN padding rows, then label every row by its borough.
take_away = (take_away
             .drop('Unnamed: 0', axis=1)
             .dropna()
             .set_index('Unnamed: 1'))
take_away.index.name = 'Borough'
take_away.head()
take_away.shape
# Loading clubs dataset
licensed_clubs = pd.read_excel('licensed-clubs-borough.xlsx', sheet_name = 'Clubs units', header = [3])
licensed_clubs.head()
# Same clean-up as for the 'pubs_bars' frame: drop the helper column, drop the
# NaN padding rows, then label every row by its borough.
licensed_clubs = (licensed_clubs
                  .drop('Unnamed: 0', axis=1)
                  .dropna()
                  .set_index('Unnamed: 1'))
licensed_clubs.index.name = 'Borough'
licensed_clubs.head()
licensed_clubs.shape
# All food/drink businesses in each borough.
# NOTE(review): element-wise addition relies on all five frames sharing the same
# Borough index and year columns — verify the sheets align.
all_food_alcohol_places = pubs_bars + licensed_restaurants + unlicensed_restaurants + take_away + licensed_clubs
all_food_alcohol_places.head()
# Proportion (in %) of pubs and bars to all food/drink businesses
proportion_of_pubs_bars = (pubs_bars/all_food_alcohol_places) * 100
proportion_of_pubs_bars.head()
# Removing the rows and columns that are not needed for our analysis
p_b= proportion_of_pubs_bars.iloc[2:34,1:16]
p_b.reset_index(inplace =True)
p_b.head()
p_b.shape
# Melting to long format (one Borough/Year/value row) for the later
# concatenation with the other data frames
pubs_bars_new = pd.melt(p_b, id_vars = ['Borough'], var_name = 'Year', value_name = 'Proportion of pubs/bars')
pubs_bars_new.head(35)
pubs_bars_new.shape
# Proportion (in %) of clubs to all food/drink businesses
proportion_of_clubs = (licensed_clubs/all_food_alcohol_places) * 100
proportion_of_clubs.head()
# Removing the rows and columns that are not needed for our analysis
clubs= proportion_of_clubs.iloc[2:34,1:16]
clubs.reset_index(inplace=True)
clubs.head()
clubs.shape
# Melting to long format (one Borough/Year/value row) for the later
# concatenation with the other data frames
clubs_new = pd.melt(clubs, id_vars = ['Borough'], var_name = 'Year', value_name = 'Proportion of clubs')
clubs_new.head()
# Importing Job Seekers Allowance Claimants file (Rates are as a percentage of
# all people of working age, from ONS mid-year estimates)
job_seekers_df = pd.read_excel('job-seekers-allowance-borough.xls', sheet_name = 'Rates')
job_seekers_df.head()
# Keeping only the November figure of each year (4 figures per year are given)
# to stay consistent with the other data frames.
# .copy() prevents SettingWithCopyWarning on the rename below.
job_seekers = job_seekers_df.iloc[2:34,[1,15,19,23,27,31,35,39,43,47,51,55,59,63,67,71]].copy()
job_seekers.head()
# Slightly changing the column names for readability
job_seekers.columns = ['Borough','2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016']
job_seekers.head()
# Fixed: the original call was neither assigned back nor inplace, so it had no
# effect on the frame.
job_seekers.reset_index(drop=True, inplace=True)
# Checking for missing values (fixed: the call parentheses were missing).
job_seekers.isnull().sum()
# Melting to long format for the later concatenation with other data frames
job_seekers_new = pd.melt(job_seekers, id_vars = ['Borough'], var_name = 'Year', value_name = 'Job Seekers Allowance Claimants (Rate)')
job_seekers_new.head()
job_seekers_new.shape
# Importing National Insurance Number Registrations (Immigration) file
# (Rates per 1,000 working age population)
immigration_df = pd.read_excel('national-insurance-number-registrations.xls', sheet_name = 'Borough Summary', header = [1])
immigration_df.head()
# Doing necessary transformations to make it consistent with other data frames
immigration_df.set_index('Area',inplace = True )
immigration_df.index.name = 'Borough'
immigration_df.head()
# Keeping the 32 borough rows and the 2002-2016 columns.
# .copy() prevents SettingWithCopyWarning on the inplace edits below.
immigration = immigration_df.iloc[2:34,19:34].copy()
# Resetting index
immigration.reset_index( inplace = True)
# Renaming columns for readability (fixed stray leading spaces on 2003/2012).
immigration.columns = ['Borough','2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016']
immigration.head()
# Melting to long format for the later concatenation with other data frames
immigration_new = pd.melt(immigration, id_vars = ['Borough'], var_name = 'Year', value_name = 'National Insurance Number Registrations (Immigration)(Rate)')
immigration_new.head()
# Checking for missing values (fixed: the call parentheses were missing).
immigration_new.isnull().sum()
immigration_new.shape
# Importing Income Support Claimants file (Rates are as a percentage of all
# people aged 16-64, from ONS mid-year estimates)
income_support_df = pd.read_excel('income-support-borough.xls', sheet_name = 'Rates')
income_support_df.head()
# Doing necessary transformations to make it consistent with other data frames
income_support_df.drop('Code', axis=1, inplace=True)
income_support_df.set_index('Area',inplace = True )
income_support_df.index.name = 'Borough'
income_support_df.head()
# Keeping only the November figure of each year (4 figures per year are given)
# to stay consistent with the other data frames.
# .copy() prevents SettingWithCopyWarning on the edits below.
income_support = income_support_df.iloc[2:34,[13,17,21,25,29,33,37,41,45,49,53,57,61,65,69]].copy()
# Renaming columns for readability (fixed stray leading spaces on 2003/2012).
income_support.columns = ['2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016']
# Resetting index (moves the Borough labels back into a column)
income_support.reset_index(inplace = True)
income_support.head()
# Melting to long format for the later concatenation with other data frames
income_support_new = pd.melt(income_support, id_vars = ['Borough'], var_name = 'Year', value_name = 'Income Support Claimants (Rate)')
income_support_new.head()
income_support_new.shape
# Importing DCLG Affordable Housing Supply file (Total affordable housing
# completions by financial year)
affordable_housing_df = pd.read_excel('dclg-affordable-housing-borough.xlsx', sheet_name = 'data')
affordable_housing_df.head()
# Keeping the 32 borough rows and the 2002-2016 columns.
# .copy() prevents SettingWithCopyWarning on the rename below.
affordable_housing = affordable_housing_df.iloc[2:34,[2,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28]].copy()
# Renaming columns for readability (fixed stray leading spaces on 2003/2012/2014).
affordable_housing.columns = ['Borough','2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016']
affordable_housing.head()
# Resetting index
affordable_housing.reset_index(drop = True, inplace= True)
affordable_housing.head()
# Melting to long format for the later concatenation with other data frames
affordable_housing_new = pd.melt(affordable_housing, id_vars = ['Borough'], var_name = 'Year', value_name = 'Affordable Housing Supply')
affordable_housing_new.head()
# Checking for missing values (fixed: the call parentheses were missing).
affordable_housing_new.isnull().sum()
# Fixed: check the shape of the melted frame, consistent with the other sections.
affordable_housing_new.shape
# Importing Ratio of House Prices to Earnings file (Median House Prices to
# Residence-Based Earnings)
House_Prices_to_Earnings_df = pd.read_excel('ratio-house-price-earnings-residence-based.xls', sheet_name = 'Median Earnings to Prices ratio')
House_Prices_to_Earnings_df.head()
# Keeping the 32 borough rows and the 2002-2016 columns.
# .copy() prevents SettingWithCopyWarning on the edits below.
House_Prices_to_Earnings = House_Prices_to_Earnings_df.iloc[2:34,2:18].copy()
# Resetting index
House_Prices_to_Earnings.reset_index(drop = True, inplace = True)
# Renaming columns for readability (fixed the stray leading space on 2003).
House_Prices_to_Earnings.columns = ['Borough','2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016']
# Melting to long format for the later concatenation with other data frames
House_Prices_to_Earnings_new = pd.melt(House_Prices_to_Earnings, id_vars = ['Borough'], var_name = 'Year', value_name = 'Ratio of House Prices to Earnings')
House_Prices_to_Earnings_new.head()
# Checking for missing values (fixed: the call parentheses were missing).
House_Prices_to_Earnings_new.isnull().sum()
# Fixed: check the shape of the melted frame, consistent with the other sections.
House_Prices_to_Earnings_new.shape
# Importing Personal Insolvency and Bankruptcy Statistics (Rates per 10,000
# adult population): one sheet per year, 2002-2016, concatenated side by side.
# Replaces 15 copy-pasted read_excel calls with a single comprehension.
insolvency_sheets = [pd.read_excel('personal-insolvency.xls', sheet_name = str(year))
                     for year in range(2002, 2017)]
insolvency_df = pd.concat(insolvency_sheets, axis=1)
insolvency_df.head()
# Keeping the borough name plus the insolvency-rate column of each year.
# .copy() prevents SettingWithCopyWarning on the edits below.
insolvency = insolvency_df.iloc[2:34,[1,3,11,19,27,35,43,51,59,69,79,89,99,109,119,129]].copy()
insolvency.head()
# Resetting index
insolvency.reset_index(drop = True, inplace = True)
# Renaming columns for readability (fixed stray leading spaces on 2003/2012).
insolvency.columns = ['Borough', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016']
# Melting to long format for the later concatenation with other data frames
insolvency_new = pd.melt(insolvency, id_vars = ['Borough'], var_name = 'Year', value_name = 'Personal Insolvency (Rate)')
insolvency_new.head()
# Checking for missing values (fixed: the call parentheses were missing).
insolvency_new.isnull().sum()
# Keeping the borough name plus the bankruptcy-rate column of each year.
# .copy() prevents SettingWithCopyWarning on the edits below.
bankruptcy = insolvency_df.iloc[2:34,[1,5,13,21,29,37,45,53,61,71,81,91,101,111,121,131]].copy()
bankruptcy.head()
# Resetting index
bankruptcy.reset_index(drop = True, inplace = True)
# Renaming columns for readability (fixed the stray leading space on 2003).
bankruptcy.columns = ['Borough','2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016']
# Melting to long format for the later concatenation with other data frames
bankruptcy_new = pd.melt(bankruptcy, id_vars = ['Borough'], var_name = 'Year', value_name = 'Personal Bankruptcy (Rate)')
bankruptcy_new.head()
# Moving the Borough index back into a column so it can serve as the melt id.
violence.reset_index(inplace = True)
# Melting to long format for the later concatenation with other data frames
violence_new = pd.melt(violence, id_vars = ['Borough'], var_name = 'Year', value_name = 'Violence Rate')
# Years 1999, 2000 and 2001 (first 3 years x 32 boroughs = 96 rows) are removed
# to make the period consistent with the other data frames.
# .copy() fixes the SettingWithCopyWarning from the inplace reset on a slice.
violence_amended_years = violence_new.iloc[96 :,:].copy()
# Resetting index
violence_amended_years.reset_index(drop = True, inplace = True)
violence_amended_years.head(20)
violence_amended_years.shape
# Dropping columns that are not needed for concating
pubs_bars_amended = pubs_bars_new.drop(['Borough','Year'], axis = 'columns')
pubs_bars_amended.head(300)
clubs_new.head(300)
# Dropping columns that are not needed for concating
clubs_amended = clubs_new.drop(['Borough','Year'], axis = 'columns')
clubs_amended.head()
# Dropping columns that are not needed for concating
job_seekers_amended = job_seekers_new.drop(['Borough','Year'], axis = 'columns')
job_seekers_amended.head()
# Dropping columns that are not needed for concating
immigration_amended = immigration_new.drop(['Borough','Year'], axis = 'columns')
immigration_amended.head()
# Dropping columns that are not needed for concating
income_support_amended = income_support_new.drop(['Borough','Year'], axis = 'columns')
income_support_amended.head()
# Dropping columns that are not needed for concating
affordable_housing_amended = affordable_housing_new.drop(['Borough','Year'], axis = 'columns')
affordable_housing_amended.head()
# Dropping columns that are not needed for concating
House_Prices_to_Earnings_amended = House_Prices_to_Earnings_new.drop(['Borough','Year'], axis = 'columns')
House_Prices_to_Earnings_amended.head()
# Dropping columns that are not needed for concating
insolvency_amended = insolvency_new.drop(['Borough','Year'], axis = 'columns')
insolvency_amended.head()
# Dropping columns that are not needed for concating
bankruptcy_amended = bankruptcy_new.drop(['Borough','Year'], axis = 'columns')
# Concating all data frames together to run a multi regression analysis
combined = pd.concat([violence_amended_years, job_seekers_amended, immigration_amended, income_support_amended,affordable_housing_amended, House_Prices_to_Earnings_amended, insolvency_amended, bankruptcy_amended,pubs_bars_amended,clubs_amended], axis = 1 )
combined.head(300)
# Checking the shape of a new data frame
combined.shape
combined.isnull().sum
# checking data types
combined.dtypes
# Converting object type to float so regression models can be build
combined = combined.astype({'Ratio of House Prices to Earnings': 'float64'})
combined = combined.astype({'Personal Insolvency (Rate)': 'float64'})
combined = combined.astype({'Personal Bankruptcy (Rate)': 'float64'})
To test the relationship between the violence rate and the socioeconomic factors, we are going to build multiple regression models. As observed before, the Westminster borough was very different to all the other boroughs in terms of its very high crime rate. So first we will test the models with all the boroughs, and then we will exclude Westminster to see whether it has any effect on model performance.
# Checking the variables for multicollinearity. A strong correlation between
# some predictors would make it difficult to distinguish their individual
# effects on the models.
correlation_matrix = combined.corr()
display(correlation_matrix)
sns.heatmap(correlation_matrix, cmap = 'RdBu')
Observations: As we can see above, there is no strong correlation between the predictor factors, so no variables need to be removed on that basis. Also, even though some of the predictors show some correlation with the violence rate, the majority of the predictors demonstrate a very weak correlation.
# Pairwise scatter plots of every column against every other
pd.plotting.scatter_matrix(combined, grid = True, figsize = (30, 30), alpha = 1)
plt.show()
# Summary statistics (count, mean, std, quartiles) for each column
combined.describe()
# Test normality for predictor factors (columns 2:11)
for col in range(2, 12):
    print(combined.columns[col])
    X = combined.iloc[:, col]
    # Perform the Shapiro-Wilk normality test. A LOWER p-value is evidence
    # AGAINST normality (the original comment had this reversed); p < 0.05
    # suggests the variable is not normally distributed.
    print(stats.shapiro(X)[1])
Observations: In the cell above, we test the normality of each variable using the Shapiro-Wilk test for normality. Lower p-values (generally < 0.05) imply that the variables are not normally distributed, which is the case with our chosen factors. However, this does not mean that we cannot build valid models, as sometimes the relevant variables may not exhibit normality. What is important after running a model is to check the residuals for normality, which is done below.
# Semi-saturated model (6 out of 9 predictors): first test the model without
# all the variables, so later we can see whether adding predictors changes it
X = combined[ ['Job Seekers Allowance Claimants (Rate)',
       'National Insurance Number Registrations (Immigration)(Rate)',
       'Income Support Claimants (Rate)', 'Affordable Housing Supply', 'Proportion of pubs/bars','Proportion of clubs']]
X = sm.add_constant(X)
# Column 2 of `combined` is the violence rate (the response variable)
y = combined.iloc[:,2]
model = sm.OLS(y, X).fit()
print(model.summary())
# Visualising the residual distribution against a fitted normal curve
mu = np.mean(model.resid)
sigma = np.std(model.resid)
pdf = stats.norm.pdf(sorted(model.resid), mu, sigma)
# `normed` was removed in matplotlib 3.x; `density` is the supported keyword
plt.hist(model.resid, bins=50, density=True)
plt.plot(sorted(model.resid), pdf, linewidth=2)
plt.show()
Observations: In the above model, the following factors are highly significant (p < 0.05), with strong evidence of their impact on the violence rate: 'Job Seekers Allowance Claimants (Rate)', 'National Insurance Number Registrations (Immigration)(Rate)', 'Income Support Claimants (Rate)' and 'Proportion of clubs'. After fitting the model, the following two variables were shown to be insignificant in explaining the variance in violent crime rates: 'Proportion of pubs/bars' and 'Affordable Housing Supply'. However, it is important to mention that we cannot conclude that those two factors are always insignificant for determining the violence rate. It might be the case that they are not significant in this particular model because of the presence of other factors such as 'National Insurance Number Registrations (Immigration)(Rate)'. The fact that R-sq is higher than 40% means that more than 40% of the variation in the violent crime rate can be explained by the model, which is impressive given that not all predictors are included. We can also observe that the residuals are quite well normally distributed, which is a good sign that the fitted model is valid.
# Fully saturated model: all nine predictor factors
X = combined[ ['Job Seekers Allowance Claimants (Rate)',
       'National Insurance Number Registrations (Immigration)(Rate)',
       'Income Support Claimants (Rate)', 'Affordable Housing Supply', 'Proportion of pubs/bars','Proportion of clubs', 'Ratio of House Prices to Earnings',
       'Personal Insolvency (Rate)', 'Personal Bankruptcy (Rate)']]
X = sm.add_constant(X)
# Column 2 of `combined` is the violence rate (the response variable)
y = combined.iloc[:,2]
model1 = sm.OLS(y, X).fit()
print(model1.summary())
# Visualising the residual distribution against a fitted normal curve
mu1 = np.mean(model1.resid)
sigma1 = np.std(model1.resid)
pdf1 = stats.norm.pdf(sorted(model1.resid), mu1, sigma1)
# `normed` was removed in matplotlib 3.x; `density` is the supported keyword
plt.hist(model1.resid, bins=50, density=True)
plt.plot(sorted(model1.resid), pdf1, linewidth=2)
plt.show()
Observations: It is interesting to see that some of the factors that were previously shown to be insignificant for violence became significant in the fully saturated model, whereas some predictors that were significant became insignificant (e.g. proportion of clubs). This proves the point that the significance of a predictor sometimes depends on the presence of other predictors in the model. Since more predictors are included in the above model, R-sq was expected to be higher. In this case R-sq shows that we can explain over half of the variance in violent crime rates using the factors listed above. Adjusted R-sq (> 50%) has also gone up, which means that the additional factors improve the performance of the model. As in the other model, we can observe that the residuals are quite well normally distributed, which means that we can draw valid conclusions from it.
Now let's remove Westminster as an 'outlier' from our analysis to see whether it has any effect on the regression models.
# Building a data frame that excludes the Westminster borough
combined_w = combined[combined['Borough'] != 'Westminster']
# Casting the object-typed columns to float so regression models can be built
for rate_column in ('Ratio of House Prices to Earnings',
                    'Personal Insolvency (Rate)',
                    'Personal Bankruptcy (Rate)'):
    combined_w = combined_w.astype({rate_column: 'float64'})
combined_w.head(33)
# Renumbering rows from zero after the filter
combined_w.reset_index(drop = True, inplace = True)
# Semi-saturated model (6 out of 9 predictors), Westminster excluded: test the
# model without all the variables so we can later see the effect of adding more
six_predictors = ['Job Seekers Allowance Claimants (Rate)',
                  'National Insurance Number Registrations (Immigration)(Rate)',
                  'Income Support Claimants (Rate)', 'Affordable Housing Supply',
                  'Proportion of pubs/bars', 'Proportion of clubs']
X = sm.add_constant(combined_w[six_predictors])
# Column 2 is the violence rate (the response variable)
y = combined_w.iloc[:, 2]
model2 = sm.OLS(y, X).fit()
print(model2.summary())
# Fully saturated model, Westminster excluded: all nine predictor factors
nine_predictors = ['Job Seekers Allowance Claimants (Rate)',
                   'National Insurance Number Registrations (Immigration)(Rate)',
                   'Income Support Claimants (Rate)', 'Affordable Housing Supply',
                   'Proportion of pubs/bars', 'Proportion of clubs',
                   'Ratio of House Prices to Earnings',
                   'Personal Insolvency (Rate)', 'Personal Bankruptcy (Rate)']
X = sm.add_constant(combined_w[nine_predictors])
# Column 2 is the violence rate (the response variable)
y = combined_w.iloc[:, 2]
model3 = sm.OLS(y, X).fit()
print(model3.summary())
Observations: The removal of Westminster has improved the performance of both the semi-saturated (R-sq 49%) and the fully saturated (R-sq 54%) models to some extent, but did not have a dramatic effect on the models in comparison to when all boroughs were present.